#--------------------------ANALIZA DANYCH ITP-------------------------------------------
#wczytanie danych
data<-read.table("mammographic_masses.data", header=FALSE, sep=",");
colnames(data) <- c('bi_rads', 'age', 'shape', 'margin', 'density', 'severity')
nan_data <- which(data=='?');
new_data <- replace(data, data=='?', NA); #zamiana ? na NA
new_data <- as.data.frame(lapply(new_data,as.numeric)) # konwersja wszystkich wartosci w dataframie na numeric
# Bindowanie kolorow do cech
featureColors <- c(bi_rads="#9ACD32",
age="#F08080",
shape="#B0E0E6",
margin="#DEB887",
density="#BA55D3",
severity="#66CDAA")
Histogramy dla poszczególnych cech w zbiorze danych:
#-------------------------histogramy pierwsze---------------------
bi_rads_h <- hist(new_data$bi_rads, main="Ocena BI-RADS", xlab="Wartosc", ylab="Ilosc wystapien", xlim=c(0,max(new_data$bi_rads, na.rm = TRUE)), ylim=c(0, 600), breaks=56, col=featureColors['bi_rads'])
zero = which(bi_rads_h$counts == 0)
text(bi_rads_h$mids[-zero],bi_rads_h$counts[-zero],labels=bi_rads_h$counts[-zero], adj=c(0.5, -0.5))
age_h <- hist(new_data$age, main="Wiek pacjentki (w latach)", xlab="Wiek", ylab="Ilosc wystapien", xlim=c(0,100), breaks=99, col=featureColors['age'])
shape_h <- hist(new_data$shape, main="Ksztalt guza", xlab="Ksztalt", ylab="Ilosc wystapien", xlim=c(0,4), ylim=c(0, 450), breaks=0:4, col=featureColors['shape'], xaxt="n")
text(shape_h$mids,shape_h$counts,labels=shape_h$counts, adj=c(0.5, -0.5))
axis(1, shape_h$mids, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)
margin_h <- hist(new_data$margin, main="Margines guza", xlab="Margines", ylab="Ilosc wystapien", xlim=c(0,max(new_data$margin, na.rm = TRUE)), ylim=c(0, 400), breaks=0:5, col=featureColors['margin'], xaxt="n")
text(margin_h$mids,margin_h$counts,labels=margin_h$counts, adj=c(0.5, -0.5))
axis(1, at=margin_h$mids, labels = rep_len("", length(margin_h$mids)), lwd=0, lwd.ticks = 1, line = -1)
text(margin_h$mids, par("usr")[3] - 0.2, labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
srt = 35, pos = 1, xpd = TRUE)
density_h <- hist(new_data$density, main="Gestosc guza", xlab="Gestosc guza", ylab="Ilosc wystapien", xlim=c(0,max(new_data$density, na.rm = TRUE)), ylim=c(0, 850), breaks=0:4, col=featureColors['density'], xaxt="n")
text(density_h$mids,density_h$counts,labels=density_h$counts, adj=c(0.5, -0.5))
axis(1, density_h$mids, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)
severity_h <- hist(new_data$severity, right=F, main="Stopien zaawansowania guza", xlab="Zlosliwosc", ylab="Ilosc wystapien", xlim=c(0,1), ylim = c(0,600), breaks=2, col=featureColors['severity'], xaxt="n")
text(severity_h$mids,severity_h$counts,labels=severity_h$counts, adj=c(0.5, -0.5))
axis(1, severity_h$mids, c("benign", "malignant"), line=-1, lwd=0, lwd.ticks = 1)
Definicja funkcji + obliczenie statystyk zbioru:
#--------funckje wazne----------------------------
getmode <- function(v, na.rm=TRUE) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
static_values <- function(v) {
min <- min(v, na.rm = TRUE)
max <- max(v, na.rm = TRUE)
mean <- mean(v, na.rm = TRUE)
sd <- sd(v, na.rm = TRUE)
var <- var(v, na.rm = TRUE)
median <- median(v, na.rm = TRUE)
mode <- getmode(v, na.rm = TRUE)
res <- c(min, max, mean, median, mode, sd, var)
names(res) <-c('Min', 'Max', 'Mean', 'Median', 'Mode', 'Sd', 'Var')
return(res)
}
#--------------------------------------------------
#wartosci statystyczne zbioru danych
(data_stats <- sapply(new_data, static_values))
## bi_rads age shape margin density severity
## Min 0.000000 18.00000 1.000000 1.000000 1.0000000 0.0000000
## Max 55.000000 96.00000 4.000000 5.000000 4.0000000 1.0000000
## Mean 4.348279 55.48745 2.721505 2.796276 2.9107345 0.4630593
## Median 4.000000 57.00000 3.000000 3.000000 3.0000000 0.0000000
## Mode 4.000000 59.00000 4.000000 1.000000 3.0000000 0.0000000
## Sd 1.783031 14.48013 1.242792 1.566546 0.3804439 0.4988932
## Var 3.179201 209.67419 1.544532 2.454065 0.1447376 0.2488944
Wartosc bi_rads=55 powstala prawdopodobnie w wyniku bledu podczas tworzenia zbioru danych Mammographic Mass. Zakladamy, ze chodzilo o wartosc 5 i na taka podmieniamy ewidentnie bledna wartosc:
#------------------------USUNIECIE NIEKOMPLETNYCH DANYCH--------------------------------
data_clean <- new_data[complete.cases(new_data), ]
#dealing with element odstajacy
data_clean$bi_rads[data_clean$bi_rads==55] <- 5
#-------------------------histogramy po czyszczeniu---------------------
bi_rads_new_h <- hist(data_clean$bi_rads, main="Ocena BI-RADS", xlab="Wartosc", ylab="Ilosc wystapien", xlim=c(0,6), ylim=c(0, 500), breaks=-0.5:6.5, col=featureColors['bi_rads'], xaxt="n")
text(bi_rads_new_h$mids,bi_rads_new_h$counts,labels=bi_rads_new_h$counts, adj=c(0.5, -0.5))
axis(1, bi_rads_new_h$mids, seq_along(bi_rads_new_h$mids))
age_new_h <- hist(data_clean$age, main="Wiek pacjentki (w latach)", xlab="Wiek", ylab="Ilosc wystapien", xlim=c(0,100), breaks=99, col=featureColors['age'])
shape_new_h <- hist(data_clean$shape, main="Ksztalt guza", xlab="Ksztalt", ylab="Ilosc wystapien", xlim=c(0,4), ylim=c(0,400), breaks=0:4, col=featureColors['shape'], xaxt="n")
text(shape_new_h$mids,shape_new_h$counts,labels=shape_new_h$counts, adj=c(0.5, -0.5))
axis(1, shape_new_h$mids, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)
margin_new_h <- hist(data_clean$margin, main="Margines guza", xlab="Margines", ylab="Ilosc wystapien", xlim=c(0,max(data_clean$margin, na.rm = TRUE)), ylim=c(0,400), breaks=0:5, col=featureColors['margin'], xaxt="n")
text(margin_new_h$mids,margin_new_h$counts,labels=margin_new_h$counts, adj=c(0.5, -0.5))
axis(1, at=margin_new_h$mids, labels = rep_len("", length(margin_new_h$mids)), lwd=0, lwd.ticks = 1, line = -1)
text(margin_new_h$mids, par("usr")[3] - 0.2, labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
srt = 25, pos = 1, xpd = TRUE)
density_new_h <- hist(data_clean$density, main="Gestosc guza", xlab="Gestosc guza", ylab="Ilosc wystapien", xlim=c(0,max(data_clean$density, na.rm = TRUE)), ylim=c(0,800), breaks=0:4, col=featureColors['density'], xaxt="n")
text(density_new_h$mids,density_new_h$counts,labels=density_new_h$counts, adj=c(0.5, -0.5))
axis(1, density_h$mids, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)
severity_new_h <- hist(data_clean$severity, right=F, main="Stopien zaawansowania guza", xlab="Zlosliwosc", ylab="Ilosc wystapien", xlim=c(0,1), ylim = c(0,500), breaks=2, col=featureColors['severity'], xaxt="n")
text(severity_new_h$mids,severity_new_h$counts,labels=severity_new_h$counts, adj=c(0.5, -0.5))
axis(1, severity_new_h$mids, c("benign", "malignant"), line=-1, lwd=0, lwd.ticks = 1)
bi_rads_benVSmag <- as.matrix(t(prop.table(table(data_clean[c('bi_rads', 'severity')]), 1)))
bi_rads_benVSmag <- cbind(bi_rads_benVSmag[,1], c(0.0,0.0), bi_rads_benVSmag[,-1])
colnames(bi_rads_benVSmag) <- as.character(0:6)
age_benVSmag <- t(prop.table(table(data_clean[c('age', 'severity')]), 1))
shape_benVSmag <- t(prop.table(table(data_clean[c('shape', 'severity')]), 1))
margin_benVSmag <- t(prop.table(table(data_clean[c('margin', 'severity')]), 1))
density_benVSmag <- t(prop.table(table(data_clean[c('density', 'severity')]), 1))
# barploty
par(mar = c(5.1, 4.1, 4.1, 6))
b <- barplot(bi_rads_benVSmag, col = c("seagreen3", "firebrick3"),
main = "Ocena BI-RADS vs. udzial guzow lagodnych/zlosliwych",
xlab = "Ocena BI-RADS", ylab = "%",
legend.text = c("benign", "malignant"),
args.legend = list(x = "topright", inset = c(-0.2, 0)))
text(b[2], 0.48, "BRAK", col="tomato2", srt=90, font=2)
barplot(age_benVSmag, col = c("seagreen3", "firebrick3"),
main = "Wiek pacjentki vs. udzial guzów lagodnych/zlosliwych",
xlab = "Wiek pacjentki", ylab = "%",
legend.text = c("benign", "malignant"),
args.legend = list(x = "topright", inset = c(-0.2, 0)))
b <- barplot(shape_benVSmag, col = c("seagreen3", "firebrick3"),
main = "Ksztalt masy vs. udzial guzow lagodnych/zlosliwych",
xlab = "Ksztalt masy", ylab = "%", xaxt="n",
legend.text = c("benign", "malignant"),
args.legend = list(x = "topright", inset = c(-0.2, 0)))
axis(1, b, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)
b <- barplot(margin_benVSmag, col = c("seagreen3", "firebrick3"),
main = "Margines masy vs. udzial guzow lagodnych/zlosliwych",
xlab = "Margines masy", ylab = "%", xaxt="n",
legend.text = c("benign", "malignant"),
args.legend = list(x = "topright", inset = c(-0.2, 0)))
axis(1, b, rep("", length(b)), line=-1, lwd=0, lwd.ticks = 1)
text(b, par("usr")[3], labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
srt = 25, pos = 1, xpd = TRUE)
b <- barplot(density_benVSmag, col = c("seagreen3", "firebrick3"),
main = "Gestosc masy vs. udzial guzow lagodnych/zlosliwych",
xlab = "Gestosc masy", ylab = "%", xaxt="n",
legend.text = c("benign", "malignant"),
args.legend = list(x = "topright", inset = c(-0.2, 0)))
axis(1, b, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)
require(reshape2)
require(plotly)
cormat <- round(cor(data_clean), 2);
melted_cormat <- melt(cormat);
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat);
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "black", high = "firebrick2", mid = "dodgerblue4",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
theme(axis.text.y = element_text(size = 12)) +
coord_fixed()
ggplotly(ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5)))
Do standardowego rozkładu normalnego N(0, 1), bez labeli (czyli bez cechy severity):
s <- scale(data_clean[,2:5])
scaled_centers <- attr(s, 'scaled:center')
scaled_scales <- attr(s, 'scaled:scale')
data_stand <- data.frame(age=s[,1], shape=s[,2], margin=s[,3], density=s[,4], data_clean['severity'])
head(data_stand, 10)
## age shape margin density severity
## 1 0.76460188 0.1755305 1.3953435 0.2403212 1
## 3 0.15117947 0.9804496 1.3953435 0.2403212 1
## 4 -1.89356190 -1.4343076 -1.1570204 0.2403212 0
## 9 0.08302143 -1.4343076 1.3953435 0.2403212 1
## 11 1.37802429 -1.4343076 0.7572525 0.2403212 1
## 12 -0.93934926 -0.6293885 -1.1570204 0.2403212 1
## 14 -1.34829753 0.1755305 -1.1570204 -2.6092013 0
## 15 0.28749556 -0.6293885 -1.1570204 -2.6092013 0
## 16 -0.12145271 -1.4343076 -1.1570204 0.2403212 0
## 17 -0.25776880 0.1755305 0.7572525 0.2403212 0
Na zbiorze treningowym.
library(rpart)
library(caret)
require(neuralnet)
#podział na zbior treningnowy i testowy
#sev <- unlist(data_clean['severity'])
sev <- data_clean$severity
set.seed(100)
part <- createDataPartition(sev, p = 0.8, list = FALSE)
training_set <- scale(data_clean[part, 2:5])
scaled_centers <- attr(training_set, 'scaled:center')
scaled_scales <- attr(training_set, 'scaled:scale')
testing_set <- data_clean[-part, 2:5]
training_sev <-sev[part]
testing_sev <-sev[-part]
data_stand_tr <- data.frame(age=training_set[,1], shape=training_set[,2], margin=training_set[,3], density=training_set[,4], training_sev)
data_ts <- data.frame(age=testing_set[,1], shape=testing_set[,2], margin=testing_set[,3], density=testing_set[,4], testing_sev)
nn <- neuralnet(training_sev ~ age+shape+margin+density, data=data_stand_tr,
err.fct = "sse", hidden = 2, act.fct = "logistic")
predsVStarget_tr <- data.frame(case=rownames(data_stand_tr),
predictions=factor(round(unlist(nn$net.result), digits = 0), labels = c('benign', 'malignant')),
target=factor(data_stand_tr$training_sev, labels = c('benign', 'malignant')))
pred <-round(predict(nn, scale(testing_set, center = scaled_centers, scale = scaled_scales)), 2)
predsVStarget_ts <-data.frame(case = rownames(testing_set),
predictions=factor(round(pred, digits = 0), labels = c('benign', 'malignant')),
target = factor(data_ts$testing_sev, labels = c('benign', 'malignant')))
confMatrix_tr <- confusionMatrix(data=predsVStarget_tr$predictions, reference=predsVStarget_tr$target, positive = "malignant")
confMatrix_ts <- confusionMatrix(data=predsVStarget_ts$predictions, reference=predsVStarget_ts$target, positive = "malignant")
confMatrix_tr
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 271 49
## malignant 74 270
##
## Accuracy : 0.8148
## 95% CI : (0.7831, 0.8436)
## No Information Rate : 0.5196
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.63
##
## Mcnemar's Test P-Value : 0.03046
##
## Sensitivity : 0.8464
## Specificity : 0.7855
## Pos Pred Value : 0.7849
## Neg Pred Value : 0.8469
## Prevalence : 0.4804
## Detection Rate : 0.4066
## Detection Prevalence : 0.5181
## Balanced Accuracy : 0.8160
##
## 'Positive' Class : malignant
##
fourfoldplot(confMatrix_tr$table, color = c("firebrick2", "seagreen3"),
conf.level = 0, margin = 1, main = "Macierz pomylek na zbiorze trenigowym")
confMatrix_ts
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 60 7
## malignant 22 77
##
## Accuracy : 0.8253
## 95% CI : (0.7588, 0.8798)
## No Information Rate : 0.506
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6498
##
## Mcnemar's Test P-Value : 0.00933
##
## Sensitivity : 0.9167
## Specificity : 0.7317
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.8955
## Prevalence : 0.5060
## Detection Rate : 0.4639
## Detection Prevalence : 0.5964
## Balanced Accuracy : 0.8242
##
## 'Positive' Class : malignant
##
fourfoldplot(confMatrix_ts$table, color = c("firebrick2", "seagreen3"),
conf.level = 0, margin = 1, main = "Macierz pomylek na zbiorze testowym")